#importing the libraries
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from scipy.stats import zscore
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
sns.set_style("darkgrid")
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
CONTEXT: The data concerns city-cycle fuel consumption in miles per gallon, to be predicted in terms of 3 multivalued discrete and 5 continuous attributes.
DATA DESCRIPTION: The data concerns city-cycle fuel consumption in miles per gallon.
Attribute Information:
1. mpg: continuous
2. cylinders: multi-valued discrete
3. displacement: continuous
4. horsepower: continuous
5. weight: continuous
6. acceleration: continuous
7. model year: multi-valued discrete
8. origin: multi-valued discrete
9. car name: string (unique for each instance)
PROJECT OBJECTIVE: Goal is to cluster the data and treat them as individual datasets to train Regression models to predict ‘mpg’.
1. Import and warehouse data:
• Import all the given datasets and explore shape and size.
• Merge all datasets onto one and explore final shape and size.
• Export the final dataset and store it on local machine in .csv, .xlsx and .json format for future use.
• Import the data from above steps into python.
#Importing the datasets.
part1a = pd.read_json("Part1 - Car-Attributes.json")
part1b = pd.read_csv("Part1 - Car name.csv")
#Exploring Shape of the datasets.
part1a.shape,part1b.shape
((398, 8), (398, 1))
print("The first dataset has", part1a.shape[0],"rows and", part1a.shape[1]," columns")
print("The second dataset has", part1b.shape[0],"rows and", part1b.shape[1]," columns")
The first dataset has 398 rows and 8 columns The second dataset has 398 rows and 1 columns
#Exploring size of datasets.
part1a.size , part1b.size
(3184, 398)
print ("The first dataset has",part1a.size,"elements")
print ("The second dataset has",part1b.size,"elements")
The first dataset has 3184 elements The second dataset has 398 elements
part1a.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
part1b.head()
| car_name | |
|---|---|
| 0 | chevrolet chevelle malibu |
| 1 | buick skylark 320 |
| 2 | plymouth satellite |
| 3 | amc rebel sst |
| 4 | ford torino |
#Merging into one dataset
part1 = pd.concat([part1a,part1b],axis=1)
#Checking the contents of the final dataset
part1.head()
| mpg | cyl | disp | hp | wt | acc | yr | origin | car_name | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
#Exploring the shape and size of the final dataset.
print("The final dataset has", part1.shape[0],"rows and", part1.shape[1]," columns")
print ("The final dataset has",part1.size,"elements")
The final dataset has 398 rows and 9 columns The final dataset has 3582 elements
part1.to_csv('Part1.csv') #Saving the final dataset as a csv file.
part1.to_excel('Part1.xlsx') #Saving the final dataset to excel file.
part1.to_json('Part1.json',orient="table") #Saving the final dataset as a json file.
part1f = pd.read_csv('Part1.csv')
part1f.head()
| Unnamed: 0 | mpg | cyl | disp | hp | wt | acc | yr | origin | car_name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
| 1 | 1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
| 2 | 2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
| 3 | 3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
| 4 | 4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
2. Data cleansing:
• Missing/incorrect value treatment
• Drop attribute/s if required using relevant functional knowledge
• Perform another kind of corrections/treatment on the data.
Missing/incorrect value treatment.
part1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 398 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 398 non-null float64 1 cyl 398 non-null int64 2 disp 398 non-null float64 3 hp 398 non-null object 4 wt 398 non-null int64 5 acc 398 non-null float64 6 yr 398 non-null int64 7 origin 398 non-null int64 8 car_name 398 non-null object dtypes: float64(3), int64(4), object(2) memory usage: 28.1+ KB
We can see that:
part1["hp"].unique()
array([130, 165, 150, 140, 198, 220, 215, 225, 190, 170, 160, 95, 97, 85,
88, 46, 87, 90, 113, 200, 210, 193, '?', 100, 105, 175, 153, 180,
110, 72, 86, 70, 76, 65, 69, 60, 80, 54, 208, 155, 112, 92, 145,
137, 158, 167, 94, 107, 230, 49, 75, 91, 122, 67, 83, 78, 52, 61,
93, 148, 129, 96, 71, 98, 115, 53, 81, 79, 120, 152, 102, 108, 68,
58, 149, 89, 63, 48, 66, 139, 103, 125, 133, 138, 135, 142, 77, 62,
132, 84, 64, 74, 116, 82], dtype=object)
We can see there are '?' in the column and hence the datatype as object.
temp = pd.DataFrame(part1.hp.str.isdigit()) #Checking the rows with faulty data.
temp[temp['hp'] == False]
| hp | |
|---|---|
| 32 | False |
| 126 | False |
| 330 | False |
| 336 | False |
| 354 | False |
| 374 | False |
part1['hp'] = part1['hp'].replace('?', np.nan)
part1.isnull().sum()
mpg 0 cyl 0 disp 0 hp 6 wt 0 acc 0 yr 0 origin 0 car_name 0 dtype: int64
Drop attribute/s if required using relevant functional knowledge
We can drop these rows since it will only make us miss some part of the dataset.
part1.dropna(axis=0, inplace=True) #since there are only 6 rows with nan values we can delete the data.
part1.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 392 entries, 0 to 397 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mpg 392 non-null float64 1 cyl 392 non-null int64 2 disp 392 non-null float64 3 hp 392 non-null float64 4 wt 392 non-null int64 5 acc 392 non-null float64 6 yr 392 non-null int64 7 origin 392 non-null int64 8 car_name 392 non-null object dtypes: float64(4), int64(4), object(1) memory usage: 30.6+ KB
Perform another kind of corrections/treatment on the data.
print(f'total duplicate rows: {part1.duplicated().sum()}') #checking duplicate values
total duplicate rows: 0
part1['origin'].value_counts() #checking the counts for categorical variables
1 245 3 79 2 68 Name: origin, dtype: int64
part1['cyl'].value_counts()
4 199 8 103 6 83 3 4 5 3 Name: cyl, dtype: int64
3. Data analysis & visualisation: [ Score: 4 points ]
• Perform detailed statistical analysis on the data.
• Perform a detailed univariate, bivariate and multivariate analysis with appropriate detailed comments after each analysis.
Perform detailed statistical analysis on the data.
part1.describe(include ="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| mpg | 392.0 | NaN | NaN | NaN | 23.445918 | 7.805007 | 9.0 | 17.0 | 22.75 | 29.0 | 46.6 |
| cyl | 392.0 | NaN | NaN | NaN | 5.471939 | 1.705783 | 3.0 | 4.0 | 4.0 | 8.0 | 8.0 |
| disp | 392.0 | NaN | NaN | NaN | 194.41199 | 104.644004 | 68.0 | 105.0 | 151.0 | 275.75 | 455.0 |
| hp | 392.0 | NaN | NaN | NaN | 104.469388 | 38.49116 | 46.0 | 75.0 | 93.5 | 126.0 | 230.0 |
| wt | 392.0 | NaN | NaN | NaN | 2977.584184 | 849.40256 | 1613.0 | 2225.25 | 2803.5 | 3614.75 | 5140.0 |
| acc | 392.0 | NaN | NaN | NaN | 15.541327 | 2.758864 | 8.0 | 13.775 | 15.5 | 17.025 | 24.8 |
| yr | 392.0 | NaN | NaN | NaN | 75.979592 | 3.683737 | 70.0 | 73.0 | 76.0 | 79.0 | 82.0 |
| origin | 392.0 | NaN | NaN | NaN | 1.576531 | 0.805518 | 1.0 | 1.0 | 1.0 | 2.0 | 3.0 |
| car_name | 392 | 301 | amc matador | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
We can see that:
Univariate Analysis
Numerical Columns
col2 = ['mpg', 'disp', 'hp', 'wt', 'acc']
for i in col2:
f, axes = plt.subplots(1, 2, figsize=(10,5))
sns.distplot(part1[i], ax=axes[0],color = 'forestgreen')
sns.boxplot(x = i, data=part1, orient='h' , ax=axes[1],color = 'darkseagreen')
axes[0].set_title('Distribution plot of {}'.format(i))
axes[1].set_title('Box plot of {}'.format(i))
plt.show()
#checking count of outliers.
q25,q75=np.percentile(part1[i],25),np.percentile(part1[i],75)
IQR=q75-q25
Threshold=IQR*1.5
lower,upper=q25-Threshold,q75+Threshold
Outliers=[j for j in part1[i] if j < lower or j > upper]
b=len(Outliers)
print('Total Number of outliers in {} {}'.format(i,b))
Total Number of outliers in mpg 0
Total Number of outliers in disp 0
Total Number of outliers in hp 10
Total Number of outliers in wt 0
Total Number of outliers in acc 11
mpg
disp
hp
wt
acc
Categorical Columns
#Creating an artificial categorical attribute named mpg_level which categorizes mpg into low, medium and high.
#to understand the distribution better.
part1['mpg_level'] = part1['mpg'].apply(lambda x: 'low' if x<17 else 'high' if x>29 else 'medium')
#Checking the categories for all variables.
print(f"categories in origin: {pd.unique(part1['origin'])}")
print(f"categories in cylinders: {pd.unique(part1['cyl'])}")
print(f"categories in model_year: {pd.unique(part1['yr'])}")
print(f"categories in mpg_level: {pd.unique(part1['mpg_level'])}")
categories in origin: [1 3 2] categories in cylinders: [8 4 6 3 5] categories in model_year: [70 71 72 73 74 75 76 77 78 79 80 81 82] categories in mpg_level: ['medium' 'low' 'high']
col2 = ['origin', 'cyl','yr','mpg_level']
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99']
for i in col2:
f, axes = plt.subplots(1, 2, figsize=(10,5))
part1[i].value_counts().plot.bar(ax=axes[0],color = 'darkseagreen')
part1[i].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[1],colors=colors)
axes[0].set_title('Bar plot of {}'.format(i))
axes[1].set_title('Pie Chart for {}'.format(i))
plt.show()
origin
cyl
yr
mpg_level
We can notice that the car_name column has a compnay name as a prefix,so maybe it will be fruitful to extract them as separate feature and do analysis on that.
part1['car_company'] = part1['car_name'].apply(lambda x: x.split()[0])
print(f'total unique categories in `car_company`: {part1.car_company.nunique()}')
print(f"\nunique categories in `car_company`:\n\n {part1.car_company.unique()}")
total unique categories in `car_company`: 37 unique categories in `car_company`: ['chevrolet' 'buick' 'plymouth' 'amc' 'ford' 'pontiac' 'dodge' 'toyota' 'datsun' 'volkswagen' 'peugeot' 'audi' 'saab' 'bmw' 'chevy' 'hi' 'mercury' 'opel' 'fiat' 'oldsmobile' 'chrysler' 'mazda' 'volvo' 'renault' 'toyouta' 'maxda' 'honda' 'subaru' 'chevroelt' 'capri' 'vw' 'mercedes-benz' 'cadillac' 'mercedes' 'vokswagen' 'triumph' 'nissan']
fig = plt.figure(1, (18, 4))
ax1 = plt.subplot(1,1,1)
sns.countplot(part1['car_company'], order=part1['car_company'].value_counts().index)
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=75)
plt.show()
Bivariate Analysis
fig = plt.figure(1, (18,4))
sns.countplot(x='yr', hue='origin', data=part1)
plt.show()
fig = plt.figure(1, (18,4))
sns.countplot(x='yr', hue='cyl', data=part1)
plt.show()
Multivariate Analysis
sns.pairplot(part1, diag_kind='kde')
<seaborn.axisgrid.PairGrid at 0x1fa078459d0>
Insights
part1.corr()
| mpg | cyl | disp | hp | wt | acc | yr | origin | |
|---|---|---|---|---|---|---|---|---|
| mpg | 1.000000 | -0.777618 | -0.805127 | -0.778427 | -0.832244 | 0.423329 | 0.580541 | 0.565209 |
| cyl | -0.777618 | 1.000000 | 0.950823 | 0.842983 | 0.897527 | -0.504683 | -0.345647 | -0.568932 |
| disp | -0.805127 | 0.950823 | 1.000000 | 0.897257 | 0.932994 | -0.543800 | -0.369855 | -0.614535 |
| hp | -0.778427 | 0.842983 | 0.897257 | 1.000000 | 0.864538 | -0.689196 | -0.416361 | -0.455171 |
| wt | -0.832244 | 0.897527 | 0.932994 | 0.864538 | 1.000000 | -0.416839 | -0.309120 | -0.585005 |
| acc | 0.423329 | -0.504683 | -0.543800 | -0.689196 | -0.416839 | 1.000000 | 0.290316 | 0.212746 |
| yr | 0.580541 | -0.345647 | -0.369855 | -0.416361 | -0.309120 | 0.290316 | 1.000000 | 0.181528 |
| origin | 0.565209 | -0.568932 | -0.614535 | -0.455171 | -0.585005 | 0.212746 | 0.181528 | 1.000000 |
sns.heatmap(part1.corr())
<AxesSubplot:>
part1.corr()['mpg'].sort_values()
wt -0.832244 disp -0.805127 hp -0.778427 cyl -0.777618 acc 0.423329 origin 0.565209 yr 0.580541 mpg 1.000000 Name: mpg, dtype: float64
We can see there is very high correlation between wt, disp,hp, cyl and mpg, just as we would expect.
4. Machine learning:
• Use K Means and Hierarchical clustering to find out the optimal number of clusters in the data.
• Share your insights about the difference in using these two methods.
part1 = pd.get_dummies(part1, columns=['origin'])
carH=part1.copy()
carK=part1.copy()
part1.head()
| mpg | cyl | disp | hp | wt | acc | yr | car_name | mpg_level | car_company | origin_1 | origin_2 | origin_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | chevrolet chevelle malibu | medium | chevrolet | 1 | 0 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | buick skylark 320 | low | buick | 1 | 0 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | plymouth satellite | medium | plymouth | 1 | 0 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | amc rebel sst | low | amc | 1 | 0 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | ford torino | medium | ford | 1 | 0 | 0 |
#separating numeric variables
cc = part1.iloc[:,0:7]
cc.head()
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 |
#scaling the variable
cc_z = cc.apply(zscore)
cc_z.head()
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.664133 | 0.620540 | -1.285258 | -1.625315 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.574594 | 0.843334 | -1.466724 | -1.625315 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.184397 | 0.540382 | -1.648189 | -1.625315 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.184397 | 0.536845 | -1.285258 | -1.625315 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.924265 | 0.555706 | -1.829655 | -1.625315 |
#calculating pairwise distance using average linkage method
link_method = linkage(cc_z.iloc[:,0:7], method = 'average')
#plotting the H-cluster
plt.figure(figsize=(25, 10))
dendrogram(link_method)
plt.show()
Appers to be to much of a visual clutter, we'll go ahead and cut down the dendrogram to give us 2 clusters/groups
# dendrogram function to arrive at dendrogram
dendrogram(
link_method,
truncate_mode='lastp',
p=2,
)
plt.show()
#vieweing the clusters formed
clusters = fcluster(link_method, 2, criterion='maxclust')
clusters
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1,
2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)
#attaching the clusters formed to the scales data
cc_z['clusters_H'] = clusters
cc_z.head()
| mpg | cyl | disp | hp | wt | acc | yr | clusters_H | |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.664133 | 0.620540 | -1.285258 | -1.625315 | 1 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.574594 | 0.843334 | -1.466724 | -1.625315 | 1 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.184397 | 0.540382 | -1.648189 | -1.625315 | 1 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.184397 | 0.536845 | -1.285258 | -1.625315 | 1 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.924265 | 0.555706 | -1.829655 | -1.625315 | 1 |
#vieweing the distribution of clusters
cc_z.clusters_H.value_counts().sort_index()
1 99 2 293 Name: clusters_H, dtype: int64
#attaching the clusters formed to the original data
cc['clusters_H']=clusters
carH['clusters_H']=clusters
cc.head()
| mpg | cyl | disp | hp | wt | acc | yr | clusters_H | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 |
#create a new data set named Hclus
Hclus=cc
Hclus.head()
| mpg | cyl | disp | hp | wt | acc | yr | clusters_H | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 1 |
#aggregating the numerical variable with the clusters formed with the mean
aggdata=cc.iloc[:,0:8].groupby('clusters_H').mean()
aggdata['Freq']=cc.clusters_H.value_counts().sort_index()
aggdata
| mpg | cyl | disp | hp | wt | acc | yr | Freq | |
|---|---|---|---|---|---|---|---|---|
| clusters_H | ||||||||
| 1 | 14.653535 | 8.000000 | 346.626263 | 160.353535 | 4128.393939 | 12.694949 | 73.696970 | 99 |
| 2 | 26.416724 | 4.617747 | 142.981229 | 85.587031 | 2588.744027 | 16.503072 | 76.750853 | 293 |
Clearly shows two disting group with a difference in average between the clusters and variables
#plotting the clusters formed
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="clusters_H",
data=cc_z,
palette=['green','brown']);
K Means Clustering
#seperating the numeric values
cc = part1.iloc[:,0:7]
cc_z1 = cc.apply(zscore)
cc_z1.head()
| mpg | cyl | disp | hp | wt | acc | yr | |
|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.664133 | 0.620540 | -1.285258 | -1.625315 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.574594 | 0.843334 | -1.466724 | -1.625315 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.184397 | 0.540382 | -1.648189 | -1.625315 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.184397 | 0.536845 | -1.285258 | -1.625315 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.924265 | 0.555706 | -1.829655 | -1.625315 |
#calculatint the within sum of squares
wss =[]
for i in range(1,5):
KM = KMeans(n_clusters=i)
KM.fit(cc_z1)
wss.append(KM.inertia_)
wss
[2744.0000000000014, 1272.9007603914608, 928.9163493824311, 725.3271838226432]
#plotting the WSS against the number of cluster to come up with optimal number of clusters using Elbow-method
plt.plot(range(1,5), wss);
plt.title('Elbow Method');
plt.xlabel("Number of Clusters")
plt.ylabel("WSS");
#using 2 centroids for clustering
k_means = KMeans(n_clusters = 2)
k_means.fit(cc_z1)
labels = k_means.labels_
# Calculating silhouette_score
silhouette_score(cc_z1,labels)
0.48120917916406525
#calculating silhouette score for different centroids
kmeans_kwargs = {
"init": "random",
"n_init": 10,
"max_iter": 300,
"random_state": 42,
}
silhouette_coefficients = []
# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 7):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(cc_z1)
score = silhouette_score(cc_z1,kmeans.labels_)
silhouette_coefficients.append(score)
#plotting silhouette score for different centroids
plt.plot(range(2, 7), silhouette_coefficients)
plt.xticks(range(2, 7))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
#attaching the labels to the datasets
cc["cluster_K"] = labels
carK['cluster_K']=labels
Kclus=cc
Kclus.head()
| mpg | cyl | disp | hp | wt | acc | yr | cluster_K | |
|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | 0 |
#vieweing the distribution of clusters
cc.cluster_K.value_counts().sort_index()
0 105 1 287 Name: cluster_K, dtype: int64
#attaching the clsuters to the scaled data
cc_z1["cluster_K"] = labels
cc_z1.head()
| mpg | cyl | disp | hp | wt | acc | yr | cluster_K | |
|---|---|---|---|---|---|---|---|---|
| 0 | -0.698638 | 1.483947 | 1.077290 | 0.664133 | 0.620540 | -1.285258 | -1.625315 | 0 |
| 1 | -1.083498 | 1.483947 | 1.488732 | 1.574594 | 0.843334 | -1.466724 | -1.625315 | 0 |
| 2 | -0.698638 | 1.483947 | 1.182542 | 1.184397 | 0.540382 | -1.648189 | -1.625315 | 0 |
| 3 | -0.955212 | 1.483947 | 1.048584 | 1.184397 | 0.536845 | -1.285258 | -1.625315 | 0 |
| 4 | -0.826925 | 1.483947 | 1.029447 | 0.924265 | 0.555706 | -1.829655 | -1.625315 | 0 |
#aggregating the numerical variable with the clusters formed with the mean
aggdata=cc.iloc[:,0:8].groupby('cluster_K').mean()
aggdata['Freq']=cc.cluster_K.value_counts().sort_index()
aggdata
| mpg | cyl | disp | hp | wt | acc | yr | Freq | |
|---|---|---|---|---|---|---|---|---|
| cluster_K | ||||||||
| 0 | 14.851429 | 7.923810 | 341.809524 | 158.000000 | 4093.771429 | 12.867619 | 73.742857 | 105 |
| 1 | 26.590244 | 4.574913 | 140.486063 | 84.885017 | 2569.222997 | 16.519512 | 76.797909 | 287 |
Clearly shows two distinct group with a difference in average between the clusters and variables
#plotting the clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="cluster_K",
data=cc_z1,
palette=['green','brown']);
part1.head()
| mpg | cyl | disp | hp | wt | acc | yr | car_name | mpg_level | car_company | origin_1 | origin_2 | origin_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | chevrolet chevelle malibu | medium | chevrolet | 1 | 0 | 0 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | buick skylark 320 | low | buick | 1 | 0 | 0 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | plymouth satellite | medium | plymouth | 1 | 0 | 0 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | amc rebel sst | low | amc | 1 | 0 | 0 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | ford torino | medium | ford | 1 | 0 | 0 |
Linear regression on the original dataset
X = part1.drop(['mpg','origin_1','mpg_level','car_name','car_company'], axis=1) #independent variable
# the dependent variable
y = part1['mpg']
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
print(regression_model.coef_)
[-3.38775426e-01 1.64385086e-02 1.81032733e-03 -6.86281725e-03 9.71082414e-02 7.74784998e-01 2.49437817e+00 2.94056913e+00]
intercept = regression_model.intercept_
intercept
-18.95965679078054
regression_model.score(X_train,y_train)
0.8426102428804347
O=regression_model.score(X_test, y_test)
O
0.7784605212174351
Linear regression on data with K means clustering
#renaming the cluster labels to light and heavy vehicles and creating dummy variables of it
carK['cluster_K']=carK['cluster_K'].astype('category')
carK['cluster_K'] = carK['cluster_K'].replace({1: 'heavy', 0: 'light'})
carK = pd.get_dummies(carK, columns=['cluster_K'])
carK.head()
| mpg | cyl | disp | hp | wt | acc | yr | car_name | mpg_level | car_company | origin_1 | origin_2 | origin_3 | cluster_K_heavy | cluster_K_light | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | chevrolet chevelle malibu | medium | chevrolet | 1 | 0 | 0 | 0 | 1 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | buick skylark 320 | low | buick | 1 | 0 | 0 | 0 | 1 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | plymouth satellite | medium | plymouth | 1 | 0 | 0 | 0 | 1 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | amc rebel sst | low | amc | 1 | 0 | 0 | 0 | 1 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | ford torino | medium | ford | 1 | 0 | 0 | 0 | 1 |
X = carK.drop(['mpg','origin_1','mpg_level','cluster_K_light','car_name','car_company'], axis=1)
# the dependent variable
y = carK['mpg']
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=12)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
print(regression_model.coef_)
[-1.08312935 0.01114995 -0.02605547 -0.00549942 0.04841365 0.72782783 1.76087694 1.99424648 -3.43785006]
intercept = regression_model.intercept_
intercept
-8.185822466161934
regression_model.score(X_train, y_train)
0.8377893163293161
K=regression_model.score(X_test, y_test)
K
0.806910901180337
Linear regression on data with Hierarchial Clustering
#renaming the cluster labels to light and heavy vehicles and creating summy variable of it
carH['clusters_H']=carH['clusters_H'].astype('category')
carH['clusters_H'] = carH['clusters_H'].replace({1: 'heavy', 2: 'light'})
carH = pd.get_dummies(carH, columns=['clusters_H'])
X = carH.drop(['mpg','origin_1','mpg_level','clusters_H_light','car_name','car_company'], axis=1)
# the dependent variable
y = carH[['mpg']]
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
LinearRegression()
for idx, col_name in enumerate(X_train.columns):
print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -1.720050531659928 The coefficient for disp is 0.029099068131167607 The coefficient for hp is -0.04603605405708172 The coefficient for wt is -0.0059126672179700135 The coefficient for acc is -0.002342640798737021 The coefficient for yr is 0.7568898776253472 The coefficient for origin_2 is 1.6920286544033463 The coefficient for origin_3 is 2.881839084738278 The coefficient for clusters_H_heavy is 3.865658446801207
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is -9.591092355523084
regression_model.score(X_train, y_train)
0.8363531795217873
H=regression_model.score(X_test, y_test)
H
0.8059288127905487
modellists = []
modellists.append(['Linear Regression on Original Data set', O*100])
modellists.append(['Linear Regression with K means clusters', K*100])
modellists.append(['Linear Regression with Heirarchical clusters', H*100])
mdl_df = pd.DataFrame(modellists, columns = ['Model','r^2 on Test'])
mdl_df
| Model | r^2 on Test | |
|---|---|---|
| 0 | Linear Regression on Original Data set | 77.846052 |
| 1 | Linear Regression with K means clusters | 80.691090 |
| 2 | Linear Regression with Heirarchical clusters | 80.592881 |
6. Improvisation:
Detailed suggestions or improvements or on quality, quantity, variety, velocity, veracity etc. on the data points collected by the company to erform a better data analysis in future.
The reason the cars are being used would give information about the cars too , the datset doen capture that information.
The units are not mentioned in all the columns that would be beneficial too.
The Manufacturer information , like which country should be given.
No information relevant to the business need provided.
CONTEXT: Company X curates and packages wine across various vineyards spread throughout the country.
DATA DESCRIPTION: The data concerns the chemical composition of the wine and its respective quality.
Attribute Information:
1. A, B, C, D: specific chemical composition measure of the wine
2. Quality: quality of wine [ Low and High ]
PROJECT OBJECTIVE: Goal is to build a synthetic data generation model using the existing data provided by the company.
1. Design a synthetic data generation model which can impute values [Attribute: Quality] wherever empty the company has missed recording the data.
#importing the data
part2 = pd.read_excel("Part2 - Company.xlsx")
part2.head()
| A | B | C | D | Quality | |
|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality A |
| 1 | 174 | 133 | 134 | 166 | Quality B |
| 2 | 159 | 163 | 135 | 131 | NaN |
| 3 | 61 | 23 | 3 | 44 | Quality A |
| 4 | 59 | 60 | 9 | 68 | Quality A |
part2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 A 61 non-null int64 1 B 61 non-null int64 2 C 61 non-null int64 3 D 61 non-null int64 4 Quality 43 non-null object dtypes: int64(4), object(1) memory usage: 2.5+ KB
part2.isnull().sum()
A 0 B 0 C 0 D 0 Quality 18 dtype: int64
There are 18 rows where the compnay has missed to input the quality of the wine sample.
part2.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| A | 61.0 | 118.557377 | 61.771494 | 3.0 | 61.0 | 136.0 | 171.0 | 200.0 |
| B | 61.0 | 116.639344 | 57.897908 | 5.0 | 63.0 | 138.0 | 168.0 | 200.0 |
| C | 61.0 | 114.081967 | 65.615368 | 3.0 | 59.0 | 135.0 | 174.0 | 200.0 |
| D | 61.0 | 114.459016 | 63.144431 | 3.0 | 51.0 | 132.0 | 174.0 | 199.0 |
row, column = part2.shape
print('The dataset contains', row, 'rows and', column, 'columns.')
The dataset contains 61 rows and 5 columns.
#make a copy of the data
part2copy=part2.copy()
part2copy['Quality'].value_counts()
Quality A 26 Quality B 17 Name: Quality, dtype: int64
#dropping the target variable
part2=part2.drop('Quality',axis=1)
#checking the distribution of the data points
part2.hist(bins = 10, figsize = (10, 8))
plt.show()
#checking for the density of the variable
plt.figure(figsize=(10, 8))
col = 1
for i in part2.columns:
plt.subplot(2, 2, col)
sns.distplot(part2[i])
col += 1
#scaling the numeric variables
w1 = part2.apply(zscore)
w1.head()
| A | B | C | D | |
|---|---|---|---|---|
| 0 | -1.168034 | -1.561080 | -1.061569 | -0.103138 |
| 1 | 0.904992 | 0.284923 | 0.306077 | 0.823013 |
| 2 | 0.660147 | 0.807376 | 0.321443 | 0.264129 |
| 3 | -0.939512 | -1.630740 | -1.706975 | -1.125099 |
| 4 | -0.972158 | -0.986381 | -1.614775 | -0.741864 |
#applying kmeans with 2 centroids
k_means = KMeans(n_clusters = 2)
k_means.fit(w1)
labels = k_means.labels_
# Calculating silhouette_score
silhouette_score(w1,labels)
0.6891674125195145
#attaching the cluster labels to the original dataset
part2["Quality_k"] = labels
part2["Quality"]= part2copy["Quality"]
#checking for the first 15 observations
part2.head(68)
| A | B | C | D | Quality_k | Quality | |
|---|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | 1 | Quality A |
| 1 | 174 | 133 | 134 | 166 | 0 | Quality B |
| 2 | 159 | 163 | 135 | 131 | 0 | NaN |
| 3 | 61 | 23 | 3 | 44 | 1 | Quality A |
| 4 | 59 | 60 | 9 | 68 | 1 | Quality A |
| ... | ... | ... | ... | ... | ... | ... |
| 56 | 200 | 186 | 185 | 179 | 0 | Quality B |
| 57 | 137 | 182 | 165 | 199 | 0 | NaN |
| 58 | 88 | 39 | 9 | 102 | 1 | NaN |
| 59 | 180 | 157 | 192 | 198 | 0 | NaN |
| 60 | 157 | 135 | 135 | 156 | 0 | NaN |
61 rows × 6 columns
#renaming the cluster labels
part2['Quality_k'] = part2['Quality_k'].replace({0: 'Quality A', 1: 'Quality B'})
part22 = part2.copy()
part22.head()
| A | B | C | D | Quality_k | Quality | |
|---|---|---|---|---|---|---|
| 0 | 47 | 27 | 45 | 108 | Quality B | Quality A |
| 1 | 174 | 133 | 134 | 166 | Quality A | Quality B |
| 2 | 159 | 163 | 135 | 131 | Quality A | NaN |
| 3 | 61 | 23 | 3 | 44 | Quality B | Quality A |
| 4 | 59 | 60 | 9 | 68 | Quality B | Quality A |
part22 = part22.dropna()
part22.isnull().sum()
A 0 B 0 C 0 D 0 Quality_k 0 Quality 0 dtype: int64
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
#renaming the cluster labels
part22['Quality_k'] = part22['Quality_k'].replace({'Quality A':0,'Quality B': 1})
part22['Quality'] = part22['Quality'].replace({'Quality A':0,'Quality B': 1})
part22['Quality'] = part22['Quality'].astype(int)
#using the cluster labels and actual target labels to check for accuracy
pred = part22['Quality_k']
actual=part22['Quality']
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(actual, pred)
plt.figure(figsize = (8, 4))
sns.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix HeatMap', fontsize = 15);
There appears to be no misclassification when checking the it with the non missing target variables and the predicted clusters, Hence the new labels can be used as a target variable for the data.
CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette.
The vehicle may be viewed from one of many different angles.
DATA DESCRIPTION: The data contains features extracted from the silhouette of vehicles in different angles. Four "Corgie" model vehicles were used for the experiment: a double decker bus, Cheverolet van, Saab 9000 and an Opel Manta 400 cars. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars.
All the features are numeric i.e. geometric features extracted from the silhouette.
PROJECT OBJECTIVE: Apply dimensionality reduction technique – PCA and train a model using principal components instead of training the model using just the raw data.
1. Data: Import, clean and pre-process the data
#import the dataset
ve = pd.read_csv('Part3 - vehicle.csv')
ve.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 95 | 48.0 | 83.0 | 178.0 | 72.0 | 10 | 162.0 | 42.0 | 20.0 | 159 | 176.0 | 379.0 | 184.0 | 70.0 | 6.0 | 16.0 | 187.0 | 197 | van |
| 1 | 91 | 41.0 | 84.0 | 141.0 | 57.0 | 9 | 149.0 | 45.0 | 19.0 | 143 | 170.0 | 330.0 | 158.0 | 72.0 | 9.0 | 14.0 | 189.0 | 199 | van |
| 2 | 104 | 50.0 | 106.0 | 209.0 | 66.0 | 10 | 207.0 | 32.0 | 23.0 | 158 | 223.0 | 635.0 | 220.0 | 73.0 | 14.0 | 9.0 | 188.0 | 196 | car |
| 3 | 93 | 41.0 | 82.0 | 159.0 | 63.0 | 9 | 144.0 | 46.0 | 19.0 | 143 | 160.0 | 309.0 | 127.0 | 63.0 | 6.0 | 10.0 | 199.0 | 207 | van |
| 4 | 85 | 44.0 | 70.0 | 205.0 | 103.0 | 52 | 149.0 | 45.0 | 19.0 | 144 | 241.0 | 325.0 | 188.0 | 127.0 | 9.0 | 11.0 | 180.0 | 183 | bus |
#checking for the dimension of the data
rows, column = ve.shape
print('The dataset contains', rows, 'rows and', column, 'columns.')
The dataset contains 846 rows and 19 columns.
print ("The dataset has",ve.size,"elements")
The dataset has 16074 elements
#checking for the data type
ve.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 846 entries, 0 to 845 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 compactness 846 non-null int64 1 circularity 841 non-null float64 2 distance_circularity 842 non-null float64 3 radius_ratio 840 non-null float64 4 pr.axis_aspect_ratio 844 non-null float64 5 max.length_aspect_ratio 846 non-null int64 6 scatter_ratio 845 non-null float64 7 elongatedness 845 non-null float64 8 pr.axis_rectangularity 843 non-null float64 9 max.length_rectangularity 846 non-null int64 10 scaled_variance 843 non-null float64 11 scaled_variance.1 844 non-null float64 12 scaled_radius_of_gyration 844 non-null float64 13 scaled_radius_of_gyration.1 842 non-null float64 14 skewness_about 840 non-null float64 15 skewness_about.1 845 non-null float64 16 skewness_about.2 845 non-null float64 17 hollows_ratio 846 non-null int64 18 class 846 non-null object dtypes: float64(14), int64(4), object(1) memory usage: 125.7+ KB
#Checking for duplicate rows
print('Duplicated rows: ', ve[ve.duplicated()].shape[0])
Duplicated rows: 0
#Finding the null values based on columns
ve.isnull().sum()
compactness 0 circularity 5 distance_circularity 4 radius_ratio 6 pr.axis_aspect_ratio 2 max.length_aspect_ratio 0 scatter_ratio 1 elongatedness 1 pr.axis_rectangularity 3 max.length_rectangularity 0 scaled_variance 3 scaled_variance.1 2 scaled_radius_of_gyration 2 scaled_radius_of_gyration.1 4 skewness_about 6 skewness_about.1 1 skewness_about.2 1 hollows_ratio 0 class 0 dtype: int64
#replacing the missing values with median values
for cols in ve.columns:
if(cols != 'class'):
ve[cols] = ve[cols].fillna(ve[cols].median())
#Statistical Summary
ve.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| compactness | 846.0 | 93.678487 | 8.234474 | 73.0 | 87.00 | 93.0 | 100.00 | 119.0 |
| circularity | 846.0 | 44.823877 | 6.134272 | 33.0 | 40.00 | 44.0 | 49.00 | 59.0 |
| distance_circularity | 846.0 | 82.100473 | 15.741569 | 40.0 | 70.00 | 80.0 | 98.00 | 112.0 |
| radius_ratio | 846.0 | 168.874704 | 33.401356 | 104.0 | 141.00 | 167.0 | 195.00 | 333.0 |
| pr.axis_aspect_ratio | 846.0 | 61.677305 | 7.882188 | 47.0 | 57.00 | 61.0 | 65.00 | 138.0 |
| max.length_aspect_ratio | 846.0 | 8.567376 | 4.601217 | 2.0 | 7.00 | 8.0 | 10.00 | 55.0 |
| scatter_ratio | 846.0 | 168.887707 | 33.197710 | 112.0 | 147.00 | 157.0 | 198.00 | 265.0 |
| elongatedness | 846.0 | 40.936170 | 7.811882 | 26.0 | 33.00 | 43.0 | 46.00 | 61.0 |
| pr.axis_rectangularity | 846.0 | 20.580378 | 2.588558 | 17.0 | 19.00 | 20.0 | 23.00 | 29.0 |
| max.length_rectangularity | 846.0 | 147.998818 | 14.515652 | 118.0 | 137.00 | 146.0 | 159.00 | 188.0 |
| scaled_variance | 846.0 | 188.596927 | 31.360427 | 130.0 | 167.00 | 179.0 | 217.00 | 320.0 |
| scaled_variance.1 | 846.0 | 439.314421 | 176.496341 | 184.0 | 318.25 | 363.5 | 586.75 | 1018.0 |
| scaled_radius_of_gyration | 846.0 | 174.706856 | 32.546277 | 109.0 | 149.00 | 173.5 | 198.00 | 268.0 |
| scaled_radius_of_gyration.1 | 846.0 | 72.443262 | 7.468734 | 59.0 | 67.00 | 71.5 | 75.00 | 135.0 |
| skewness_about | 846.0 | 6.361702 | 4.903244 | 0.0 | 2.00 | 6.0 | 9.00 | 22.0 |
| skewness_about.1 | 846.0 | 12.600473 | 8.930962 | 0.0 | 5.00 | 11.0 | 19.00 | 41.0 |
| skewness_about.2 | 846.0 | 188.918440 | 6.152247 | 176.0 | 184.00 | 188.0 | 193.00 | 206.0 |
| hollows_ratio | 846.0 | 195.632388 | 7.438797 | 181.0 | 190.25 | 197.0 | 201.00 | 211.0 |
2. EDA and visualisation: Create a detailed performance report using univariate, bi-variate and multivariate EDA techniques. Find out all possible hiddenpatterns by using all possible methods.
Univariate Analysis
col3 = ['compactness', 'circularity', 'distance_circularity', 'radius_ratio',
'pr.axis_aspect_ratio', 'max.length_aspect_ratio', 'scatter_ratio',
'elongatedness', 'pr.axis_rectangularity', 'max.length_rectangularity',
'scaled_variance', 'scaled_variance.1', 'scaled_radius_of_gyration',
'scaled_radius_of_gyration.1', 'skewness_about', 'skewness_about.1',
'skewness_about.2', 'hollows_ratio']
for i in col3:
f, axes = plt.subplots(1, 2, figsize=(10,5))
sns.distplot(ve[i], ax=axes[0],color = 'forestgreen')
sns.boxplot(x = i, data=ve, orient='h' , ax=axes[1],color = 'darkseagreen')
axes[0].set_title('Distribution plot of {}'.format(i))
axes[1].set_title('Box plot of {}'.format(i))
plt.show()
#checking count of outliers.
q25,q75=np.percentile(ve[i],25),np.percentile(ve[i],75)
IQR=q75-q25
Threshold=IQR*1.5
lower,upper=q25-Threshold,q75+Threshold
Outliers=[j for j in ve[i] if j < lower or j > upper]
b=len(Outliers)
print('Total Number of outliers in {} {}'.format(i,b))
Total Number of outliers in compactness 0
Total Number of outliers in circularity 0
Total Number of outliers in distance_circularity 0
Total Number of outliers in radius_ratio 3
Total Number of outliers in pr.axis_aspect_ratio 8
Total Number of outliers in max.length_aspect_ratio 13
Total Number of outliers in scatter_ratio 0
Total Number of outliers in elongatedness 0
Total Number of outliers in pr.axis_rectangularity 0
Total Number of outliers in max.length_rectangularity 0
Total Number of outliers in scaled_variance 1
Total Number of outliers in scaled_variance.1 2
Total Number of outliers in scaled_radius_of_gyration 0
Total Number of outliers in scaled_radius_of_gyration.1 15
Total Number of outliers in skewness_about 12
Total Number of outliers in skewness_about.1 1
Total Number of outliers in skewness_about.2 0
Total Number of outliers in hollows_ratio 0
#checking the distribution of the class variable
print(ve['class'].value_counts())
plt.title('Count of Vehicle Class column')
sns.countplot(x = 'class', data = ve);
car 429 bus 218 van 199 Name: class, dtype: int64
Bivariate Analysis
#Boxplots against the class with other numeric variables
for i in col3:
fig,axs = plt.subplots(1,1,figsize=(5,5))
sns.boxplot(x ='class',y=ve[i], data=ve,color = 'darkseagreen')
#find the outliers and replace them by median
for col_name in ve.drop(columns = 'class').columns:
q1 = ve[col_name].quantile(0.25)
q3 = ve[col_name].quantile(0.75)
iqr = q3 - q1
low = q1 - 1.5 * iqr
high = q3 + 1.5 * iqr
ve.loc[(ve[col_name] < low) | (ve[col_name] > high), col_name] = ve[col_name].median()
#boxplot after outlier treatment
plt.figure(figsize=(25,23))
col = 1
for i in ve.drop(columns='class').columns:
plt.subplot(6, 4, col)
sns.boxplot(ve[i])
col += 1
Multivariate Analysis
sns.pairplot(ve,hue='class')
<seaborn.axisgrid.PairGrid at 0x1fa0a5d9070>
Most of the columns look like they have linear relationships and hence we will see the correlation of all the attributes.
ve.corr()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| compactness | 1.000000 | 0.684887 | 0.789928 | 0.721925 | 0.192864 | 0.499928 | 0.812620 | -0.788750 | 0.813694 | 0.676143 | 0.769871 | 0.806170 | 0.585243 | -0.246681 | 0.197308 | 0.156348 | 0.298537 | 0.365552 |
| circularity | 0.684887 | 1.000000 | 0.792320 | 0.638280 | 0.203253 | 0.560470 | 0.847938 | -0.821472 | 0.843400 | 0.961318 | 0.802768 | 0.827462 | 0.925816 | 0.068745 | 0.136351 | -0.009666 | -0.104426 | 0.046351 |
| distance_circularity | 0.789928 | 0.792320 | 1.000000 | 0.794222 | 0.244332 | 0.666809 | 0.905076 | -0.911307 | 0.893025 | 0.774527 | 0.869584 | 0.883943 | 0.705771 | -0.229353 | 0.099107 | 0.262345 | 0.146098 | 0.332732 |
| radius_ratio | 0.721925 | 0.638280 | 0.794222 | 1.000000 | 0.650554 | 0.463958 | 0.769941 | -0.825392 | 0.744139 | 0.579468 | 0.786183 | 0.760257 | 0.550774 | -0.390459 | 0.035755 | 0.179601 | 0.405849 | 0.491758 |
| pr.axis_aspect_ratio | 0.192864 | 0.203253 | 0.244332 | 0.650554 | 1.000000 | 0.150295 | 0.194195 | -0.298144 | 0.163047 | 0.147592 | 0.207101 | 0.196401 | 0.148591 | -0.321070 | -0.056030 | -0.021088 | 0.400882 | 0.415734 |
| max.length_aspect_ratio | 0.499928 | 0.560470 | 0.666809 | 0.463958 | 0.150295 | 1.000000 | 0.490759 | -0.504181 | 0.487931 | 0.642713 | 0.401391 | 0.463249 | 0.397397 | -0.335444 | 0.081898 | 0.141664 | 0.083794 | 0.413174 |
| scatter_ratio | 0.812620 | 0.847938 | 0.905076 | 0.769941 | 0.194195 | 0.490759 | 1.000000 | -0.971601 | 0.989751 | 0.809083 | 0.960883 | 0.980447 | 0.799875 | 0.011314 | 0.064242 | 0.211647 | 0.005628 | 0.118817 |
| elongatedness | -0.788750 | -0.821472 | -0.911307 | -0.825392 | -0.298144 | -0.504181 | -0.971601 | 1.000000 | -0.948996 | -0.775854 | -0.947644 | -0.948851 | -0.766314 | 0.078391 | -0.046943 | -0.183642 | -0.115126 | -0.216905 |
| pr.axis_rectangularity | 0.813694 | 0.843400 | 0.893025 | 0.744139 | 0.163047 | 0.487931 | 0.989751 | -0.948996 | 1.000000 | 0.810934 | 0.947329 | 0.973606 | 0.796690 | 0.027545 | 0.073127 | 0.213801 | -0.018649 | 0.099286 |
| max.length_rectangularity | 0.676143 | 0.961318 | 0.774527 | 0.579468 | 0.147592 | 0.642713 | 0.809083 | -0.775854 | 0.810934 | 1.000000 | 0.750222 | 0.789632 | 0.866450 | 0.053856 | 0.130702 | 0.004129 | -0.103948 | 0.076770 |
| scaled_variance | 0.769871 | 0.802768 | 0.869584 | 0.786183 | 0.207101 | 0.401391 | 0.960883 | -0.947644 | 0.947329 | 0.750222 | 1.000000 | 0.943780 | 0.785073 | 0.025828 | 0.024693 | 0.197122 | 0.015171 | 0.086330 |
| scaled_variance.1 | 0.806170 | 0.827462 | 0.883943 | 0.760257 | 0.196401 | 0.463249 | 0.980447 | -0.948851 | 0.973606 | 0.789632 | 0.943780 | 1.000000 | 0.782972 | 0.009386 | 0.065731 | 0.204941 | 0.017557 | 0.119642 |
| scaled_radius_of_gyration | 0.585243 | 0.925816 | 0.705771 | 0.550774 | 0.148591 | 0.397397 | 0.799875 | -0.766314 | 0.796690 | 0.866450 | 0.785073 | 0.782972 | 1.000000 | 0.215279 | 0.162970 | -0.055667 | -0.224450 | -0.118002 |
| scaled_radius_of_gyration.1 | -0.246681 | 0.068745 | -0.229353 | -0.390459 | -0.321070 | -0.335444 | 0.011314 | 0.078391 | 0.027545 | 0.053856 | 0.025828 | 0.009386 | 0.215279 | 1.000000 | -0.057755 | -0.123996 | -0.832738 | -0.901332 |
| skewness_about | 0.197308 | 0.136351 | 0.099107 | 0.035755 | -0.056030 | 0.081898 | 0.064242 | -0.046943 | 0.073127 | 0.130702 | 0.024693 | 0.065731 | 0.162970 | -0.057755 | 1.000000 | -0.041734 | 0.086661 | 0.062619 |
| skewness_about.1 | 0.156348 | -0.009666 | 0.262345 | 0.179601 | -0.021088 | 0.141664 | 0.211647 | -0.183642 | 0.213801 | 0.004129 | 0.197122 | 0.204941 | -0.055667 | -0.123996 | -0.041734 | 1.000000 | 0.074473 | 0.200651 |
| skewness_about.2 | 0.298537 | -0.104426 | 0.146098 | 0.405849 | 0.400882 | 0.083794 | 0.005628 | -0.115126 | -0.018649 | -0.103948 | 0.015171 | 0.017557 | -0.224450 | -0.832738 | 0.086661 | 0.074473 | 1.000000 | 0.892581 |
| hollows_ratio | 0.365552 | 0.046351 | 0.332732 | 0.491758 | 0.415734 | 0.413174 | 0.118817 | -0.216905 | 0.099286 | 0.076770 | 0.086330 | 0.119642 | -0.118002 | -0.901332 | 0.062619 | 0.200651 | 0.892581 | 1.000000 |
plt.figure(figsize=(12,8))
sns.heatmap(ve.corr(),annot=True)
<AxesSubplot:>
#spliiting the data to check for correlation with the class variable
X = ve.loc[:, ve.columns != 'class']
y = ve['class'].astype('category').cat.codes
#plotting the correlation with target variable
plt.figure(figsize = (15, 8))
ax=sns.barplot(x=X.columns, y = X.corrwith(y))
sns.barplot(x = X.columns, y = X.corrwith(y))
plt.title('Correlation with Class column', fontsize = 20)
x=plt.setp(ax.get_xticklabels(), rotation=90)
3. Classifier: Design and train a best fit SVM classier using all the data attributes.
#scaling the muerical variables
XScaled=X.apply(zscore)
XScaled.head()
| compactness | circularity | distance_circularity | radius_ratio | pr.axis_aspect_ratio | max.length_aspect_ratio | scatter_ratio | elongatedness | pr.axis_rectangularity | max.length_rectangularity | scaled_variance | scaled_variance.1 | scaled_radius_of_gyration | scaled_radius_of_gyration.1 | skewness_about | skewness_about.1 | skewness_about.2 | hollows_ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.160580 | 0.518073 | 0.057177 | 0.300945 | 1.933135 | 0.912212 | -0.207598 | 0.136262 | -0.224342 | 0.758332 | -0.400771 | -0.337407 | 0.285705 | -0.315806 | -0.032330 | 0.387162 | -0.312012 | 0.183957 |
| 1 | -0.325470 | -0.623732 | 0.120741 | -0.850666 | -0.740596 | 0.427456 | -0.599423 | 0.520519 | -0.610886 | -0.344578 | -0.594220 | -0.618623 | -0.513630 | 0.009122 | 0.624090 | 0.161740 | 0.013265 | 0.452977 |
| 2 | 1.254193 | 0.844303 | 1.519141 | 1.265808 | 0.863642 | 0.912212 | 1.148719 | -1.144597 | 0.935290 | 0.689401 | 1.114582 | 1.131806 | 1.392477 | 0.171586 | 1.718123 | -0.401818 | -0.149374 | 0.049447 |
| 3 | -0.082445 | -0.623732 | -0.006386 | -0.290423 | 0.328896 | 0.427456 | -0.750125 | 0.648605 | -0.610886 | -0.344578 | -0.916635 | -0.739145 | -1.466683 | -1.453054 | -0.032330 | -0.289106 | 1.639649 | 1.529056 |
| 4 | -1.054545 | -0.134387 | -0.769150 | 1.141310 | -0.027601 | -0.057300 | -0.599423 | 0.520519 | -0.610886 | -0.275646 | 1.694930 | -0.647319 | 0.408680 | -0.072110 | 0.624090 | -0.176395 | -1.450481 | -1.699181 |
#splitting the original data into train and test 70:30
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size = 0.3, random_state = 10)
rtr, ctr = X_train.shape
print('The training set comprises of', rtr, 'rows and', ctr, 'columns.')
The training set comprises of 592 rows and 18 columns.
rt, ct = X_test.shape
print('The test set comprises of', rt, 'rows and', ct, 'columns.')
The test set comprises of 254 rows and 18 columns.
# Building a Support Vector Machine on train data
svc_model = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model.fit(X_train, y_train)
SVC(C=4)
#predicting on train data
sv_train_predict = svc_model .predict(X_train)
print("Model Accuracy on train: {0:.4f}".format(metrics.accuracy_score(y_train, sv_train_predict)))
Model Accuracy on train: 0.9899
#predicting on test data
sv_test_predict = svc_model .predict(X_test)
print("Model Accuracy on test: {0:.4f}".format(metrics.accuracy_score(y_test, sv_test_predict)))
Model Accuracy on test: 0.9685
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(y_test, sv_test_predict)
plt.figure(figsize = (10, 5))
sns.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_test, sv_test_predict, labels=[0,1,2]))
Classification Report
precision recall f1-score support
0 1.00 0.99 0.99 71
1 0.98 0.96 0.97 125
2 0.90 0.97 0.93 58
accuracy 0.97 254
macro avg 0.96 0.97 0.97 254
weighted avg 0.97 0.97 0.97 254
precision_SV, recall_SV, f1_score_SV, support = precision_recall_fscore_support(y_test, sv_test_predict,average='macro')
print('Precision Score :', '%0.2f' % precision_SV)
print('Recall Score :', '%0.2f' % recall_SV)
print('F1-Score:', '%0.2f' % f1_score_SV)
SV_Acc= accuracy_score(y_test, sv_test_predict)
print('Accuracy Score :','%0.2f' % SV_Acc)
Precision Score : 0.96 Recall Score : 0.97 F1-Score: 0.97 Accuracy Score : 0.97
4. Dimensional reduction: perform dimensional reduction on the data.
#plotting the cummulative variance explained by the principal componets
pca = PCA()
X_pca_ = pca.fit_transform(XScaled)
plt.figure(figsize = (10, 5))
plt.plot((np.cumsum(pca.explained_variance_ratio_) * 100), marker = 'X')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance');
print(pca.explained_variance_)
[9.74940269e+00 3.35071912e+00 1.19238155e+00 1.13381916e+00 8.83997312e-01 6.66265745e-01 3.18150910e-01 2.28179142e-01 1.31018595e-01 7.98619108e-02 7.33979478e-02 6.46162669e-02 4.01448646e-02 3.22758478e-02 2.93936408e-02 2.27005257e-02 1.98136761e-02 5.16287320e-03]
#plotting the
plt.figure(figsize = (12, 8))
plt.step(list(range(18)), (np.cumsum(pca.explained_variance_ratio_) * 100), where = 'mid')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance')
plt.title('Vehicle Dataset Explained Variance');
#Using 6 components and printing the eigen vectors
pca3 = PCA(n_components=6)
pca3.fit(XScaled)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(XScaled)
[[ 0.27250289 0.28725469 0.30242111 0.26971354 0.09786073 0.19520014 0.31052393 -0.3090069 0.307287 0.27815416 0.29976509 0.30553237 0.26323762 -0.04193594 0.03608321 0.05872048 0.03801314 0.08474 ] [-0.08704358 0.13162176 -0.04614301 -0.19793126 -0.25783995 -0.10804563 0.07528535 -0.01322994 0.0875602 0.12215424 0.07726575 0.07150302 0.21058205 0.50362158 -0.01576632 -0.09274624 -0.50162122 -0.50761211] [-0.03818521 -0.20114691 0.06346211 0.05628517 -0.06199275 -0.14895782 0.10904283 -0.09085269 0.1060705 -0.21368469 0.1445998 0.11034374 -0.20287019 0.07386402 -0.55917399 0.6706805 -0.06224071 -0.04170535] [ 0.13867501 -0.03805548 0.10895429 -0.25435509 -0.61276572 0.27867816 0.00539295 0.06521486 0.03089915 0.04146747 -0.06400509 -0.00219687 -0.08553965 -0.11539962 0.47370331 0.42842603 -0.0274096 0.09603749] [ 0.13710147 -0.13899556 -0.08001743 0.13374437 0.12360146 -0.63489336 0.08555745 -0.07907345 0.08164638 -0.25111293 0.14747123 0.11010098 -0.00521211 0.1380686 0.56655224 0.13086982 0.18051929 -0.11078807] [ 0.26361138 -0.07134742 -0.01690062 -0.13818365 -0.57782861 -0.289097 0.09774711 -0.07572829 0.10540323 -0.07819621 0.1329124 0.11539822 -0.0670574 -0.13151308 -0.31917609 -0.46840497 0.28013644 0.05944441]] [0.54099325 0.18593103 0.06616512 0.0629155 0.04905291 0.03697101]
#printing the original features and the reduced features
pca_6 = PCA(n_components = 6)
X_pca = pca_6.fit_transform(XScaled)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_pca.shape[1])
Original number of features: 18 Reduced number of features: 6
#viewing the first 5 observations of the pca components
pca_df = pd.DataFrame(data = X_pca)
pca_df.head()
| 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|
| 0 | 0.584228 | -0.675673 | -0.453334 | -0.750656 | -0.777515 | -1.848809 |
| 1 | -1.512180 | -0.348934 | -0.333436 | 1.268953 | -0.324929 | -0.118317 |
| 2 | 3.913448 | 0.234507 | -1.265094 | 0.137224 | 0.915751 | -0.685594 |
| 3 | -1.535193 | -3.044413 | -0.469623 | 0.324317 | -0.611590 | 0.367777 |
| 4 | -0.642062 | 1.488882 | -0.246288 | -0.550939 | 0.471655 | -1.012697 |
sns.pairplot(pca_df, diag_kind = 'kde');
5. Classifier: Design and train a best fit SVM classier using dimensionally reduced attributes.
#splitting the pca data into train and test 70:30
X_tr, X_te, y_tr, y_te = train_test_split(X_pca, y, test_size = 0.3, random_state = 10)
rtr_pca, ctr_pca = X_tr.shape
print('The PCA training set comprises of', rtr_pca, 'rows and', ctr_pca, 'columns.')
The PCA training set comprises of 592 rows and 6 columns.
rt_pca, ct_pca = X_te.shape
print('The PCA test set comprises of', rt_pca, 'rows and', ct_pca, 'columns.')
The PCA test set comprises of 254 rows and 6 columns.
#SVM on the pca data
svc_model_pca = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model_pca.fit(X_tr, y_tr)
SVC(C=4)
#predicting on train data
sv_tr_predict = svc_model_pca .predict(X_tr)
print("Model Accuracy on train: {0:.4f}".format(metrics.accuracy_score(y_tr, sv_tr_predict)))
print()
Model Accuracy on train: 0.9476
#predicting on test data
sv_te_predict = svc_model_pca .predict(X_te)
print("Model Accuracy on test: {0:.4f}".format(metrics.accuracy_score(y_te, sv_te_predict)))
print()
Model Accuracy on test: 0.9213
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(y_te, sv_te_predict)
plt.figure(figsize = (10, 5))
sns.heatmap(cm, annot = True, fmt = 'd')
plt.xlabel('Actual Classes', fontsize = 15)
plt.ylabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_te, sv_te_predict, labels=[0,1,2]))
Classification Report
precision recall f1-score support
0 0.96 0.94 0.95 71
1 0.94 0.91 0.93 125
2 0.84 0.91 0.88 58
accuracy 0.92 254
macro avg 0.91 0.92 0.92 254
weighted avg 0.92 0.92 0.92 254
precision_SV_pca, recall_SV_pca, f1_score_SV_pca, support_pca = precision_recall_fscore_support(y_te, sv_te_predict,average='macro')
print('Precision Score :', '%0.2f' % precision_SV_pca)
print('Recall Score :', '%0.2f' % recall_SV_pca)
print('F1-Score:', '%0.2f' % f1_score_SV_pca)
SV_Acc_pca= accuracy_score(y_te, sv_te_predict)
print('Accuracy Score :','%0.2f' % SV_Acc_pca)
Precision Score : 0.91 Recall Score : 0.92 F1-Score: 0.92 Accuracy Score : 0.92
6. Conclusion: Showcase key pointer on how dimensional reduction helped in this case.
modellists = []
modellists.append(['Support Vector Classifier without PCA', SV_Acc * 100, recall_SV * 100, precision_SV * 100,f1_score_SV*100])
modellists.append(['Support Vector Classifier with PCA', SV_Acc_pca* 100, recall_SV_pca * 100, precision_SV_pca * 100,f1_score_SV_pca*100])
mdl_df = pd.DataFrame(modellists, columns = ['Model','Accuracy Score of Test Data', 'Recall Score', 'Precision Score','F1 Score'])
mdl_df
| Model | Accuracy Score of Test Data | Recall Score | Precision Score | F1 Score | |
|---|---|---|---|---|---|
| 0 | Support Vector Classifier without PCA | 96.850394 | 97.047758 | 96.227745 | 96.596702 |
| 1 | Support Vector Classifier with PCA | 92.125984 | 92.315169 | 91.352049 | 91.773898 |
DOMAIN: Sports management
CONTEXT: Company X is a sports management company for international cricket.
DATA DESCRIPTION: The data is collected belongs to batsman from IPL series conducted so far. Attribute Information:
1. Runs: Runs score by the batsman
2. Ave: Average runs scored by the batsman per match
3. SR: strike rate of the batsman
4. Fours: number of boundary/four scored
5. Six: number of boundary/six scored
6. HF: number of half centuries scored so far
PROJECT OBJECTIVE: Goal is to build a data driven batsman ranking model for the sports management company to make business decisions.
1. EDA and visualisation: Create a detailed performance report using univariate, bi-variate and multivariate EDA techniques. Find out all possible hidden
patterns by using all possible methods.
#importing the data
bb=pd.read_csv('Part4 - batting_bowling_ipl_bat.csv')
bb=bb.dropna(axis=0)
bb.head()
| Name | Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|---|
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 |
| 5 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 |
| 7 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 |
| 9 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 |
bb.shape, bb.size
((90, 7), 630)
#checking for the dimension of the data
rows, column = bb.shape
print('The dataset contains', rows, 'rows and', column, 'columns.')
print('The dataset has',bb.size, 'elements')
The dataset contains 90 rows and 7 columns. The dataset has 630 elements
bb.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 90 entries, 1 to 179 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Name 90 non-null object 1 Runs 90 non-null float64 2 Ave 90 non-null float64 3 SR 90 non-null float64 4 Fours 90 non-null float64 5 Sixes 90 non-null float64 6 HF 90 non-null float64 dtypes: float64(6), object(1) memory usage: 5.6+ KB
#Statistical Summary
bb.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Runs | 90.0 | 219.933333 | 156.253669 | 2.00 | 98.000 | 196.500 | 330.7500 | 733.00 |
| Ave | 90.0 | 24.729889 | 13.619215 | 0.50 | 14.665 | 24.440 | 32.1950 | 81.33 |
| SR | 90.0 | 119.164111 | 23.656547 | 18.18 | 108.745 | 120.135 | 131.9975 | 164.10 |
| Fours | 90.0 | 19.788889 | 16.399845 | 0.00 | 6.250 | 16.000 | 28.0000 | 73.00 |
| Sixes | 90.0 | 7.577778 | 8.001373 | 0.00 | 3.000 | 6.000 | 10.0000 | 59.00 |
| HF | 90.0 | 1.188889 | 1.688656 | 0.00 | 0.000 | 0.500 | 2.0000 | 9.00 |
print('Duplicated rows: ', bb[bb.duplicated()].shape[0])
Duplicated rows: 0
plt.figure(figsize=(15,8))
b1=bb.groupby('Name')['Runs'].sum().sort_values(ascending = False ).head(10)
b1= b1.reset_index()
b1.columns = ['Name' ,'Runs']
sns.barplot(data= b1 , x= 'Name' , y ='Runs');
plt.title("Top 10 Players by Runs");
plt.figure(figsize=(15,8))
b1=bb.groupby('Name')['Ave'].sum().sort_values(ascending = False ).head(15)
b1= b1.reset_index()
b1.columns = ['Name' ,'Ave']
sns.barplot(data= b1 , x= 'Name' , y ='Ave')
plt.title("Top 15 players by Average");
plt.figure(figsize=(15,8))
b1=bb.groupby('Name')['SR'].sum().sort_values(ascending = False ).head(15)
b1= b1.reset_index()
b1.columns = ['Name' ,'SR']
sns.barplot(data= b1 , x= 'Name' , y ='SR')
plt.title("Top 15 players by Strike Rate");
plt.figure(figsize=(15,8))
b1=bb.groupby('Name')['Fours'].sum().sort_values(ascending = False ).head(10)
b1= b1.reset_index()
b1.columns = ['Name' ,'Fours']
sns.barplot(data= b1 , x= 'Name' , y ='Fours')
plt.title("Top 10 players by Fours");
plt.figure(figsize=(15,8))
b1=bb.groupby('Name')['Sixes'].sum().sort_values(ascending = False ).head(10)
b1= b1.reset_index()
b1.columns = ['Name' ,'Sixes']
sns.barplot(data= b1 , x= 'Name' , y ='Sixes')
plt.title("Top 10 players by Sixes");
plt.figure(figsize=(15,8))
b1=bb.groupby('Name')['HF'].sum().sort_values(ascending = False ).head(10)
b1= b1.reset_index()
b1.columns = ['Name' ,'HF']
sns.barplot(data= b1 , x= 'Name' , y ='HF')
plt.title("Top 10 players by Half Centuries");
plt.figure(figsize=(15, 12))
col = 1
for i in bb.drop(columns='Name').columns:
plt.subplot(2, 3, col)
sns.distplot(bb[i])
col += 1
col4 = ['Runs', 'Ave', 'SR', 'Fours', 'Sixes', 'HF']
for i in col4:
f, axes = plt.subplots(1, 2, figsize=(10,5))
sns.distplot(bb[i], ax=axes[0],color = 'forestgreen')
sns.boxplot(x = i, data=bb, orient='h' , ax=axes[1],color = 'darkseagreen')
axes[0].set_title('Distribution plot of {}'.format(i))
axes[1].set_title('Box plot of {}'.format(i))
plt.show()
#checking count of outliers.
q25,q75=np.percentile(bb[i],25),np.percentile(bb[i],75)
IQR=q75-q25
Threshold=IQR*1.5
lower,upper=q25-Threshold,q75+Threshold
Outliers=[j for j in bb[i] if j < lower or j > upper]
b=len(Outliers)
print('Total Number of outliers in {} {}'.format(i,b))
Total Number of outliers in Runs 1
Total Number of outliers in Ave 3
Total Number of outliers in SR 5
Total Number of outliers in Fours 3
Total Number of outliers in Sixes 1
Total Number of outliers in HF 2
There are outliers in the data,but we will not be treating them as its highly likely that these are genuine observation.
#checking for correlation
plt.figure(figsize=(8,5))
corr=bb.drop(columns='Name').corr()
sns.heatmap(corr,annot=True);
We can observe that all the variables have high correlation
2. Model Builiding: Build a data driven model to rank all the players in the dataset using all or the most important performance features.
#scaling the numeric variables
cc = bb.iloc[:,1:7]
cc1 = cc.apply(zscore)
cc1.head()
| Runs | Ave | SR | Fours | Sixes | HF | |
|---|---|---|---|---|---|---|
| 1 | 3.301945 | 2.683984 | 1.767325 | 1.607207 | 6.462679 | 4.651551 |
| 3 | 2.381639 | 0.896390 | 1.036605 | 2.710928 | 1.184173 | 2.865038 |
| 5 | 1.770248 | 0.610640 | 1.788154 | 2.281703 | 1.435530 | 2.269533 |
| 7 | 1.667276 | 1.388883 | 1.297182 | 1.300618 | 1.561209 | 2.269533 |
| 9 | 2.246490 | 1.174755 | 0.444038 | 2.343021 | 1.309851 | 2.269533 |
#checking for the within sum of squares
wss =[]
for i in range(1,6):
KM = KMeans(n_clusters=i)
KM.fit(cc1)
wss.append(KM.inertia_)
wss
[540.0000000000001, 296.26135354732025, 223.36518460916952, 178.1750796759041, 144.68430051153075]
#plotting to check for optimal clustres
plt.plot(range(1,6), wss);
plt.title('Elbow Method');
plt.xlabel("Number of Clusters")
plt.ylabel("WSS");
#using 2 centroids
k_means = KMeans(n_clusters = 2)
k_means.fit(cc1)
labels = k_means.labels_
# Calculating silhouette_score
silhouette_score(cc1,labels)
0.41111085574076756
#plotting silhouette score for different centroids
kmeans_kwargs = {
"init": "random",
"n_init": 10,
"max_iter": 300,
"random_state": 42,
}
silhouette_coefficients = []
# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 6):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(cc1)
score = silhouette_score(cc1,kmeans.labels_)
silhouette_coefficients.append(score)
plt.plot(range(2,6), silhouette_coefficients)
plt.xticks(range(2, 6))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
#attaching the labels to the original data
bb['cluster']=labels
bb.head()
| Name | Runs | Ave | SR | Fours | Sixes | HF | cluster | |
|---|---|---|---|---|---|---|---|---|
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | 1 |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 | 1 |
| 5 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | 1 |
| 7 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 | 1 |
| 9 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 | 1 |
#veiwing the distribution of the clusters
bb.cluster.value_counts().sort_index()
0 55 1 35 Name: cluster, dtype: int64
#aggregating the clusters with the numeric variables with their mean
aggdata=bb.iloc[:,0:9].groupby('cluster').mean()
aggdata['Freq']=bb.cluster.value_counts().sort_index()
aggdata
| Runs | Ave | SR | Fours | Sixes | HF | Freq | |
|---|---|---|---|---|---|---|---|
| cluster | |||||||
| 0 | 122.745455 | 17.582000 | 110.481455 | 10.309091 | 3.636364 | 0.218182 | 55 |
| 1 | 372.657143 | 35.962286 | 132.808286 | 34.685714 | 13.771429 | 2.714286 | 35 |
#based on the above table renaming/ranking the playes in Grade A and Grade B
bb['cluster'] = bb['cluster'].replace({1: 'Grade A', 0: 'Grade B'})
#list of Grade A players
Grade_A = bb[bb['cluster'] == 'Grade A']
Grade_A
| Name | Runs | Ave | SR | Fours | Sixes | HF | cluster | |
|---|---|---|---|---|---|---|---|---|
| 1 | CH Gayle | 733.0 | 61.08 | 160.74 | 46.0 | 59.0 | 9.0 | Grade A |
| 3 | G Gambhir | 590.0 | 36.87 | 143.55 | 64.0 | 17.0 | 6.0 | Grade A |
| 5 | V Sehwag | 495.0 | 33.00 | 161.23 | 57.0 | 19.0 | 5.0 | Grade A |
| 7 | CL White | 479.0 | 43.54 | 149.68 | 41.0 | 20.0 | 5.0 | Grade A |
| 9 | S Dhawan | 569.0 | 40.64 | 129.61 | 58.0 | 18.0 | 5.0 | Grade A |
| 11 | AM Rahane | 560.0 | 40.00 | 129.33 | 73.0 | 10.0 | 5.0 | Grade A |
| 13 | KP Pietersen | 305.0 | 61.00 | 147.34 | 22.0 | 20.0 | 3.0 | Grade A |
| 15 | RG Sharma | 433.0 | 30.92 | 126.60 | 39.0 | 18.0 | 5.0 | Grade A |
| 17 | AB de Villiers | 319.0 | 39.87 | 161.11 | 26.0 | 15.0 | 3.0 | Grade A |
| 19 | JP Duminy | 244.0 | 81.33 | 128.42 | 13.0 | 11.0 | 2.0 | Grade A |
| 21 | DA Warner | 256.0 | 36.57 | 164.10 | 28.0 | 14.0 | 3.0 | Grade A |
| 23 | SR Watson | 255.0 | 42.50 | 151.78 | 26.0 | 14.0 | 2.0 | Grade A |
| 25 | F du Plessis | 398.0 | 33.16 | 130.92 | 29.0 | 17.0 | 3.0 | Grade A |
| 27 | OA Shah | 340.0 | 37.77 | 132.81 | 24.0 | 16.0 | 3.0 | Grade A |
| 29 | DJ Bravo | 371.0 | 46.37 | 140.53 | 20.0 | 20.0 | 0.0 | Grade A |
| 31 | DJ Hussey | 396.0 | 33.00 | 129.83 | 28.0 | 17.0 | 2.0 | Grade A |
| 33 | SK Raina | 441.0 | 25.94 | 135.69 | 36.0 | 19.0 | 1.0 | Grade A |
| 35 | AT Rayudu | 333.0 | 37.00 | 132.14 | 21.0 | 14.0 | 2.0 | Grade A |
| 37 | Mandeep Singh | 432.0 | 27.00 | 126.31 | 53.0 | 7.0 | 2.0 | Grade A |
| 39 | R Dravid | 462.0 | 28.87 | 112.13 | 63.0 | 4.0 | 2.0 | Grade A |
| 41 | DR Smith | 157.0 | 39.25 | 160.20 | 18.0 | 7.0 | 1.0 | Grade A |
| 43 | M Vijay | 336.0 | 25.84 | 125.84 | 39.0 | 10.0 | 2.0 | Grade A |
| 45 | SPD Smith | 362.0 | 40.22 | 135.58 | 24.0 | 14.0 | 0.0 | Grade A |
| 47 | TM Dilshan | 285.0 | 35.62 | 109.19 | 33.0 | 5.0 | 3.0 | Grade A |
| 49 | RV Uthappa | 405.0 | 27.00 | 118.07 | 38.0 | 10.0 | 2.0 | Grade A |
| 51 | SE Marsh | 336.0 | 30.54 | 120.00 | 39.0 | 7.0 | 2.0 | Grade A |
| 53 | KA Pollard | 220.0 | 24.44 | 138.36 | 15.0 | 14.0 | 2.0 | Grade A |
| 55 | DMD Jayawardene | 335.0 | 27.91 | 112.41 | 39.0 | 3.0 | 3.0 | Grade A |
| 57 | V Kohli | 364.0 | 28.00 | 111.65 | 33.0 | 9.0 | 2.0 | Grade A |
| 59 | MA Agarwal | 225.0 | 20.45 | 142.40 | 19.0 | 15.0 | 1.0 | Grade A |
| 61 | SR Tendulkar | 324.0 | 29.45 | 114.48 | 39.0 | 4.0 | 2.0 | Grade A |
| 63 | MEK Hussey | 261.0 | 32.62 | 110.59 | 28.0 | 8.0 | 2.0 | Grade A |
| 65 | JH Kallis | 409.0 | 25.56 | 106.51 | 34.0 | 10.0 | 2.0 | Grade A |
| 67 | MS Dhoni | 357.0 | 29.75 | 128.41 | 26.0 | 9.0 | 1.0 | Grade A |
| 71 | JD Ryder | 256.0 | 25.60 | 120.75 | 23.0 | 8.0 | 2.0 | Grade A |
#list of Grade B players
Grade_B = bb[bb['cluster'] == 'Grade B']
Grade_B
| Name | Runs | Ave | SR | Fours | Sixes | HF | cluster | |
|---|---|---|---|---|---|---|---|---|
| 69 | MS Bisla | 213.0 | 30.42 | 133.12 | 16.0 | 10.0 | 1.0 | Grade B |
| 73 | BJ Hodge | 245.0 | 30.62 | 140.00 | 18.0 | 9.0 | 0.0 | Grade B |
| 75 | NV Ojha | 255.0 | 23.18 | 113.83 | 21.0 | 13.0 | 1.0 | Grade B |
| 77 | DB Das | 126.0 | 42.00 | 135.48 | 9.0 | 6.0 | 0.0 | Grade B |
| 79 | AC Gilchrist | 172.0 | 34.40 | 120.27 | 21.0 | 4.0 | 1.0 | Grade B |
| 81 | BB McCullum | 289.0 | 24.08 | 102.12 | 37.0 | 3.0 | 1.0 | Grade B |
| 83 | IK Pathan | 176.0 | 25.14 | 139.68 | 14.0 | 6.0 | 0.0 | Grade B |
| 85 | Azhar Mahmood | 186.0 | 23.25 | 130.98 | 16.0 | 8.0 | 0.0 | Grade B |
| 87 | MK Pandey | 143.0 | 20.42 | 127.67 | 12.0 | 6.0 | 1.0 | Grade B |
| 89 | S Badrinath | 196.0 | 28.00 | 108.28 | 23.0 | 2.0 | 1.0 | Grade B |
| 91 | DA Miller | 98.0 | 32.66 | 130.66 | 6.0 | 4.0 | 0.0 | Grade B |
| 93 | MK Tiwary | 260.0 | 26.00 | 105.69 | 21.0 | 3.0 | 1.0 | Grade B |
| 95 | JA Morkel | 107.0 | 15.28 | 157.35 | 5.0 | 6.0 | 0.0 | Grade B |
| 97 | LRPL Taylor | 197.0 | 19.70 | 115.20 | 12.0 | 7.0 | 1.0 | Grade B |
| 99 | M Manhas | 120.0 | 30.00 | 125.00 | 10.0 | 4.0 | 0.0 | Grade B |
| 101 | DT Christian | 145.0 | 29.00 | 122.88 | 8.0 | 6.0 | 0.0 | Grade B |
| 103 | RA Jadeja | 191.0 | 15.91 | 126.49 | 13.0 | 9.0 | 0.0 | Grade B |
| 105 | JEC Franklin | 220.0 | 24.44 | 98.65 | 15.0 | 6.0 | 1.0 | Grade B |
| 107 | KC Sangakkara | 200.0 | 18.18 | 108.69 | 21.0 | 4.0 | 1.0 | Grade B |
| 109 | Y Nagar | 153.0 | 30.60 | 115.03 | 13.0 | 3.0 | 0.0 | Grade B |
| 111 | STR Binny | 90.0 | 22.50 | 134.32 | 9.0 | 3.0 | 0.0 | Grade B |
| 113 | SS Tiwary | 191.0 | 23.87 | 112.35 | 9.0 | 8.0 | 0.0 | Grade B |
| 115 | KD Karthik | 238.0 | 18.30 | 111.73 | 30.0 | 2.0 | 0.0 | Grade B |
| 117 | AL Menaria | 220.0 | 20.00 | 108.91 | 14.0 | 8.0 | 0.0 | Grade B |
| 119 | PA Patel | 194.0 | 17.63 | 117.57 | 19.0 | 4.0 | 0.0 | Grade B |
| 121 | SC Ganguly | 268.0 | 17.86 | 98.89 | 30.0 | 4.0 | 0.0 | Grade B |
| 123 | YK Pathan | 194.0 | 19.40 | 114.79 | 10.0 | 7.0 | 0.0 | Grade B |
| 125 | Harbhajan Singh | 108.0 | 12.00 | 135.00 | 14.0 | 3.0 | 0.0 | Grade B |
| 127 | RE Levi | 83.0 | 13.83 | 113.69 | 10.0 | 4.0 | 1.0 | Grade B |
| 129 | LR Shukla | 75.0 | 12.50 | 131.57 | 4.0 | 5.0 | 0.0 | Grade B |
| 131 | Y Venugopal Rao | 132.0 | 22.00 | 104.76 | 8.0 | 5.0 | 0.0 | Grade B |
| 133 | AD Mathews | 127.0 | 18.14 | 117.59 | 5.0 | 4.0 | 0.0 | Grade B |
| 135 | PP Chawla | 106.0 | 13.25 | 120.45 | 9.0 | 4.0 | 0.0 | Grade B |
| 137 | Shakib Al Hasan | 91.0 | 15.16 | 122.97 | 6.0 | 3.0 | 0.0 | Grade B |
| 139 | N Saini | 140.0 | 14.00 | 99.29 | 16.0 | 0.0 | 1.0 | Grade B |
| 141 | MN Samuels | 124.0 | 17.71 | 100.81 | 7.0 | 5.0 | 0.0 | Grade B |
| 143 | MJ Clarke | 98.0 | 16.33 | 104.25 | 12.0 | 0.0 | 0.0 | Grade B |
| 145 | R Bhatia | 35.0 | 11.66 | 125.00 | 4.0 | 0.0 | 0.0 | Grade B |
| 147 | R Vinay Kumar | 68.0 | 13.60 | 109.67 | 3.0 | 2.0 | 0.0 | Grade B |
| 149 | P Kumar | 35.0 | 11.66 | 116.66 | 2.0 | 1.0 | 0.0 | Grade B |
| 151 | J Botha | 58.0 | 14.50 | 107.40 | 4.0 | 1.0 | 0.0 | Grade B |
| 153 | A Ashish Reddy | 35.0 | 8.75 | 120.68 | 3.0 | 1.0 | 0.0 | Grade B |
| 155 | DL Vettori | 31.0 | 7.75 | 119.23 | 3.0 | 1.0 | 0.0 | Grade B |
| 157 | SP Goswami | 69.0 | 13.80 | 102.98 | 4.0 | 1.0 | 0.0 | Grade B |
| 159 | SL Malinga | 55.0 | 9.16 | 103.77 | 4.0 | 3.0 | 0.0 | Grade B |
| 161 | RJ Peterson | 32.0 | 10.66 | 106.66 | 3.0 | 1.0 | 0.0 | Grade B |
| 163 | R Ashwin | 18.0 | 6.00 | 120.00 | 2.0 | 0.0 | 0.0 | Grade B |
| 165 | B Kumar | 40.0 | 13.33 | 100.00 | 4.0 | 0.0 | 0.0 | Grade B |
| 167 | DW Steyn | 19.0 | 3.80 | 90.47 | 0.0 | 1.0 | 0.0 | Grade B |
| 169 | A Mishra | 16.0 | 5.33 | 80.00 | 1.0 | 0.0 | 0.0 | Grade B |
| 171 | Z Khan | 12.0 | 6.00 | 70.58 | 1.0 | 0.0 | 0.0 | Grade B |
| 173 | WD Parnell | 19.0 | 4.75 | 70.37 | 2.0 | 0.0 | 0.0 | Grade B |
| 175 | PC Valthaty | 30.0 | 5.00 | 58.82 | 4.0 | 0.0 | 0.0 | Grade B |
| 177 | RP Singh | 6.0 | 3.00 | 50.00 | 0.0 | 0.0 | 0.0 | Grade B |
| 179 | R Sharma | 2.0 | 0.50 | 18.18 | 0.0 | 0.0 | 0.0 | Grade B |
1. List down all possible dimensionality reduction techniques that can be implemented using python.
Dimensionality Reduction Tehcniques can be classified into 3 types:
Feature selection:
a) Missing Value Ratio: If the dataset has too many missing values, we use this approach to reduce the number of variables. We can drop the variables having a large number of missing values in them.
b) Low Variance filter: We apply this approach to identify and drop constant variables from the dataset. The target variable is not unduly affected by variables with low variance, and hence these variables can be safely dropped.
c) High Correlation filter: A pair of variables having high correlation increases multicollinearity in the dataset. So, we can use this technique to find highly correlated features and drop them accordingly.
d) Random Forest: This is one of the most commonly used techniques which tells us the importance of each feature present in the dataset. We can find the importance of each feature and keep the top most features, resulting in dimensionality reduction.
e) Both Backward Feature Elimination and Forward Feature Selection techniques take a lot of computational time and are thus generally used on smaller datasets.
Components / Factor Based:
a) Factor Analysis: This technique is best suited for situations where we have highly correlated set of variables. It divides the variables based on their correlation into different groups, and represents each group with a factor.
b) Principal Component Analysis: This is one of the most widely used techniques for dealing with linear data. It divides the data into a set of components which try to explain as much variance as possible.
c) Independent Component Analysis: We can use ICA to transform the data into independent components which describe the data using less number of components.
Projection Based:
a) ISOMAP: We use this technique when the data is strongly non-linear.
b) t-SNE: This technique also works well when the data is strongly non-linear. It works extremely well for visualizations as well.
c) UMAP: This technique works well for high dimensional data. Its run-time is shorter as compared to t-SNE.
2. So far you have used dimensional reduction on numeric data. Is it possible to do the same on a multimedia data and text data ? Please illustrate your findings using a simple implementation on python.
Yes, Dimensionality reduction techniques can work for Multimedia data as well as text data.
Exploring Handwritten Digits
from sklearn.datasets import load_digits
digits = load_digits()
digits.images.shape
(1797, 8, 8)
The images data is a three-dimensional array: 1,797 samples, each consisting of an 8×8 grid of pixels.
# visualize the first 50 of these
import matplotlib.pyplot as plt
fig, axes = plt.subplots(5, 5, figsize=(8, 8),
subplot_kw={'xticks':[], 'yticks':[]},
gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i,ax in enumerate(axes.flat):
ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
ax.text(0.05, 0.05, str(digits.target[i]),
transform=ax.transAxes, color='green')
#two-dimensional,[n_samples, n_features] representation
X = digits.data
X.shape
(1797, 64)
y = digits.target
y.shape
(1797,)
1,797 samples and 64 features.
#transform the data to two dimensions using manifold learning algorithm called Isomap
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
iso.fit(digits.data)
data_projected = iso.transform(digits.data)
data_projected.shape
(1797, 2)
# plot this data to see if we can learn anything from its structure
plt.figure(figsize=(10, 8))
plt.scatter(data_projected[:, 0], data_projected[:, 1], c=digits.target,edgecolor='none', alpha=0.5,cmap=plt.cm.get_cmap('magma_r', 10))
plt.colorbar(label='digit label', ticks=range(10))
plt.clim(-0.5, 9.5);
# Split X and y into training and test set in 70:30 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
#fit a Gaussian naive Bayes model
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
pred = model.predict(X_test)
# gauge its accuracy by comparing the true values of the test set to the predictions
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)
0.8407407407407408
#visualization of confusion matrix in the form of a heatmap
plt.figure(figsize = (12, 8))
mat = confusion_matrix(y_test, pred)
sns.heatmap(mat, square=True, annot=True, cbar=False)
plt.xlabel('predicted value')
plt.ylabel('Actual value');
plt.title('Confusion Matrix HeatMap', fontsize = 15);
Another way to gain intuition into the characteristics of the model is to plot the inputs again, with their predicted labels. We’ll use green for correct labels, and red for incorrect labels
fig, axes = plt.subplots(10, 10, figsize=(8, 8),
subplot_kw={'xticks':[], 'yticks':[]},
gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
ax.imshow(digits.images[i], cmap='binary', interpolation='nearest')
ax.text(0.05, 0.05, str(pred[i]),
transform=ax.transAxes,
color='green' if (y_test[i] == pred[i]) else 'red')